import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import plotly.express as px
import plotly.io as pio
%matplotlib inline
pio.renderers.default = "plotly_mimetype+notebook"
df = pd.read_csv('gapminder_clean.csv')
df.describe
<bound method NDFrame.describe of Unnamed: 0 Country Name Year Agriculture, value added (% of GDP) \
0 0 Afghanistan 1962 NaN
1 1 Afghanistan 1967 NaN
2 2 Afghanistan 1972 NaN
3 3 Afghanistan 1977 NaN
4 4 Afghanistan 1982 NaN
... ... ... ... ...
2602 2602 Zimbabwe 1987 14.407528
2603 2603 Zimbabwe 1992 7.413793
2604 2604 Zimbabwe 1997 18.934082
2605 2605 Zimbabwe 2002 14.029007
2606 2606 Zimbabwe 2007 21.597907
CO2 emissions (metric tons per capita) \
0 0.073781
1 0.123782
2 0.130820
3 0.183118
4 0.165879
... ...
2602 1.598217
2603 1.533724
2604 1.194678
2605 0.942795
2606 0.743449
Domestic credit provided by financial sector (% of GDP) \
0 21.276422
1 9.917662
2 18.880833
3 13.836822
4 NaN
... ...
2602 74.161607
2603 43.120518
2604 63.058320
2605 164.559047
2606 NaN
Electric power consumption (kWh per capita) \
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
... ...
2602 878.072691
2603 778.695133
2604 870.988697
2605 827.329873
2606 700.642317
Energy use (kg of oil equivalent per capita) \
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
... ...
2602 896.673611
2603 923.493407
2604 804.508892
2605 772.676619
2606 725.681049
Exports of goods and services (% of GDP) \
0 4.878051
1 6.772908
2 14.763231
3 11.662904
4 NaN
... ...
2602 24.015710
2603 27.227263
2604 37.595273
2605 31.834799
2606 37.785373
Fertility rate, total (births per woman) GDP growth (annual %) \
0 7.450 NaN
1 7.450 NaN
2 7.450 NaN
3 7.449 NaN
4 7.450 NaN
... ... ...
2602 5.784 1.150737
2603 4.840 -9.015570
2604 4.237 2.680594
2605 4.018 -8.894023
2606 4.022 -3.653327
Imports of goods and services (% of GDP) \
0 9.349593
1 14.209827
2 18.105850
3 14.823175
4 NaN
... ...
2602 21.274886
2603 36.485231
2604 44.609791
2605 34.972553
2606 46.387531
Industry, value added (% of GDP) Inflation, GDP deflator (annual %) \
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
4 NaN NaN
... ... ...
2602 32.451242 7.189361
2603 40.862069 -14.129659
2604 25.554678 -2.879048
2605 NaN 2.712950
2606 33.074953 0.894887
Life expectancy at birth, total (years) \
0 33.219902
1 35.389415
2 37.610146
3 40.110146
4 43.230732
... ...
2602 61.753805
2603 56.491976
2604 46.065902
2605 40.679146
2606 44.177756
Population density (people per sq. km of land area) \
0 14.312061
1 15.881812
2 17.947027
3 19.998926
4 19.402324
... ...
2602 24.649495
2603 28.485762
2604 31.174507
2605 32.807111
2606 34.374559
Services, etc., value added (% of GDP) pop continent gdpPercap
0 NaN 10267083.0 Asia 853.100710
1 NaN 11537966.0 Asia 836.197138
2 NaN 13079460.0 Asia 739.981106
3 NaN 14880372.0 Asia 786.113360
4 NaN 12881816.0 Asia 978.011439
... ... ... ... ...
2602 53.141236 9216418.0 Africa 706.157306
2603 51.724135 10704340.0 Africa 693.420786
2604 55.511236 11404948.0 Africa 792.449960
2605 NaN 11926563.0 Africa 672.038623
2606 45.327139 12311143.0 Africa 469.709298
[2607 rows x 20 columns]>
df.head()
| Unnamed: 0 | Country Name | Year | Agriculture, value added (% of GDP) | CO2 emissions (metric tons per capita) | Domestic credit provided by financial sector (% of GDP) | Electric power consumption (kWh per capita) | Energy use (kg of oil equivalent per capita) | Exports of goods and services (% of GDP) | Fertility rate, total (births per woman) | GDP growth (annual %) | Imports of goods and services (% of GDP) | Industry, value added (% of GDP) | Inflation, GDP deflator (annual %) | Life expectancy at birth, total (years) | Population density (people per sq. km of land area) | Services, etc., value added (% of GDP) | pop | continent | gdpPercap | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Afghanistan | 1962 | NaN | 0.073781 | 21.276422 | NaN | NaN | 4.878051 | 7.450 | NaN | 9.349593 | NaN | NaN | 33.219902 | 14.312061 | NaN | 10267083.0 | Asia | 853.100710 |
| 1 | 1 | Afghanistan | 1967 | NaN | 0.123782 | 9.917662 | NaN | NaN | 6.772908 | 7.450 | NaN | 14.209827 | NaN | NaN | 35.389415 | 15.881812 | NaN | 11537966.0 | Asia | 836.197138 |
| 2 | 2 | Afghanistan | 1972 | NaN | 0.130820 | 18.880833 | NaN | NaN | 14.763231 | 7.450 | NaN | 18.105850 | NaN | NaN | 37.610146 | 17.947027 | NaN | 13079460.0 | Asia | 739.981106 |
| 3 | 3 | Afghanistan | 1977 | NaN | 0.183118 | 13.836822 | NaN | NaN | 11.662904 | 7.449 | NaN | 14.823175 | NaN | NaN | 40.110146 | 19.998926 | NaN | 14880372.0 | Asia | 786.113360 |
| 4 | 4 | Afghanistan | 1982 | NaN | 0.165879 | NaN | NaN | NaN | NaN | 7.450 | NaN | NaN | NaN | NaN | 43.230732 | 19.402324 | NaN | 12881816.0 | Asia | 978.011439 |
#Filter the data to include only rows where Year is 1962.
df_1962 = df[df['Year'] == 1962]
#make a scatter plot comparing 'CO2 emissions (metric tons per capita)' and gdpPercap for the filtered data.
plt.scatter(df_1962['CO2 emissions (metric tons per capita)'], df_1962['gdpPercap'])
plt.title('CO2 Emissions vs. GDP per capita (Year 1962)')
plt.xlabel('CO2 emissions (metric tons per capita)')
plt.ylabel('GDP per capita')
plt.show()
#On the filtered data, calculate the correlation of 'CO2 emissions (metric tons per capita)' and gdpPercap.
df_1962.dropna(subset=['CO2 emissions (metric tons per capita)', 'gdpPercap'], inplace=True)
corr, p_value = stats.pearsonr(df_1962['CO2 emissions (metric tons per capita)'], df_1962['gdpPercap'])
print("Correlation:", corr)
print("P-value:", p_value)
Correlation: 0.9260816725019473 P-value: 1.1286792210036883e-46
/var/folders/tx/lt873txs0695kyyx9kbh1gcc0000gn/T/ipykernel_27897/1983806514.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
#On the unfiltered data, answer "In what year is the correlation between 'CO2 emissions (metric tons per capita)' and gdpPercap the strongest?"
corr_by_year = df.groupby('Year').corr(method='pearson')
corr_col = corr_by_year['CO2 emissions (metric tons per capita)'].dropna().abs().idxmax()
strongest_corr_year = corr_by_year.loc[corr_col].name
print(strongest_corr_year)
(1962, 'CO2 emissions (metric tons per capita)')
/var/folders/tx/lt873txs0695kyyx9kbh1gcc0000gn/T/ipykernel_27897/3760653074.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.corr is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
# Filter the dataset to that year
df_strongest_corr_year = df[df['Year'] == 1962]
df_strongest_corr_year.head(5)
| Unnamed: 0 | Country Name | Year | Agriculture, value added (% of GDP) | CO2 emissions (metric tons per capita) | Domestic credit provided by financial sector (% of GDP) | Electric power consumption (kWh per capita) | Energy use (kg of oil equivalent per capita) | Exports of goods and services (% of GDP) | Fertility rate, total (births per woman) | GDP growth (annual %) | Imports of goods and services (% of GDP) | Industry, value added (% of GDP) | Inflation, GDP deflator (annual %) | Life expectancy at birth, total (years) | Population density (people per sq. km of land area) | Services, etc., value added (% of GDP) | pop | continent | gdpPercap | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Afghanistan | 1962 | NaN | 0.073781 | 21.276422 | NaN | NaN | 4.878051 | 7.450 | NaN | 9.349593 | NaN | NaN | 33.219902 | 14.312061 | NaN | 10267083.0 | Asia | 853.100710 |
| 10 | 10 | Albania | 1962 | NaN | 1.439956 | NaN | NaN | NaN | NaN | 6.282 | NaN | NaN | NaN | NaN | 64.162854 | 62.456898 | NaN | 1728137.0 | Europe | 2312.888958 |
| 20 | 20 | Algeria | 1962 | NaN | 0.484954 | NaN | NaN | NaN | 19.793873 | 7.614 | -19.685042 | 20.818647 | NaN | 2.351279 | 47.045000 | 4.908240 | NaN | 11000948.0 | Africa | 2550.816880 |
| 30 | 30 | American Samoa | 1962 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 105.590000 | NaN | NaN | NaN | NaN |
| 40 | 40 | Andorra | 1962 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 32.714894 | NaN | NaN | NaN | NaN |
df_scatter = df_strongest_corr_year[~df_strongest_corr_year['pop'].isnull()]
df_scatter.head(5)
| Unnamed: 0 | Country Name | Year | Agriculture, value added (% of GDP) | CO2 emissions (metric tons per capita) | Domestic credit provided by financial sector (% of GDP) | Electric power consumption (kWh per capita) | Energy use (kg of oil equivalent per capita) | Exports of goods and services (% of GDP) | Fertility rate, total (births per woman) | GDP growth (annual %) | Imports of goods and services (% of GDP) | Industry, value added (% of GDP) | Inflation, GDP deflator (annual %) | Life expectancy at birth, total (years) | Population density (people per sq. km of land area) | Services, etc., value added (% of GDP) | pop | continent | gdpPercap | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Afghanistan | 1962 | NaN | 0.073781 | 21.276422 | NaN | NaN | 4.878051 | 7.450 | NaN | 9.349593 | NaN | NaN | 33.219902 | 14.312061 | NaN | 10267083.0 | Asia | 853.100710 |
| 10 | 10 | Albania | 1962 | NaN | 1.439956 | NaN | NaN | NaN | NaN | 6.282 | NaN | NaN | NaN | NaN | 64.162854 | 62.456898 | NaN | 1728137.0 | Europe | 2312.888958 |
| 20 | 20 | Algeria | 1962 | NaN | 0.484954 | NaN | NaN | NaN | 19.793873 | 7.614 | -19.685042 | 20.818647 | NaN | 2.351279 | 47.045000 | 4.908240 | NaN | 11000948.0 | Africa | 2550.816880 |
| 50 | 50 | Angola | 1962 | NaN | 0.216025 | NaN | NaN | NaN | NaN | 7.396 | NaN | NaN | NaN | NaN | 33.787585 | 4.384299 | NaN | 4826015.0 | Africa | 4269.276742 |
| 80 | 80 | Argentina | 1962 | NaN | 2.522392 | 17.265977 | NaN | NaN | 4.691843 | 3.089 | -0.852022 | 9.383683 | NaN | 28.871842 | 65.432610 | 7.778624 | NaN | 21283783.0 | Americas | 7133.166023 |
continent = df_scatter['continent'].unique()
continent
array(['Asia', 'Europe', 'Africa', 'Americas', 'Oceania'], dtype=object)
#create an interactive scatter plot comparing 'CO2 emissions (metric tons per capita)' and gdpPercap, where the point size is determined by pop (population) and the color is determined by the continent.
fig = px.scatter(df_scatter, x="CO2 emissions (metric tons per capita)", y="gdpPercap", size="pop", color="continent")
fig.show()
Question1:What is the relationship between continent and 'Energy use (kg of oil equivalent per capita)'?
Answer: We used boxplots to visually represent the distribution of energy use across different continents.To determine whether there were statistically significant differences in energy use between the continents, we employed an ANOVA test, ANOVA was chosen because it is suitable for situations where there are multiple groups being compared, in this case we compared energy use across continents with more than two groups (e.g., Africa, Asia, Europe, etc.). The results of the analysis revealed that there are indeed significant differences in energy use among the continents. This suggests that the continent factor has a significant effect on energy use, as the amount of energy used per capita varies significantly across different continents.
# boxplot
sns.boxplot(data=df,x='continent',y='Energy use (kg of oil equivalent per capita)')
plt.title('Energy use by continent')
plt.ylabel('Energy use (kg of oil equivalent per capita)')
plt.xlabel('continent')
# ANOVA test
df_1 = df[~df['continent'].isnull()]
df_1 = df_1[~df_1['Energy use (kg of oil equivalent per capita)'].isnull()]
df_1.head(5)
fval, pval = stats.f_oneway(df_1[df_1['continent'] == 'Africa']['Energy use (kg of oil equivalent per capita)'],
df_1[df_1['continent'] == 'Americas']['Energy use (kg of oil equivalent per capita)'],
df_1[df_1['continent'] == 'Asia']['Energy use (kg of oil equivalent per capita)'],
df_1[df_1['continent'] == 'Europe']['Energy use (kg of oil equivalent per capita)'],
df_1[df_1['continent'] == 'Oceania']['Energy use (kg of oil equivalent per capita)'])
print(f'ANOVA F-value: {fval}')
print(f'ANOVA p-value: {pval}')
ANOVA F-value: 51.45915759042634 ANOVA p-value: 8.527003487154367e-39
Question2:Is there a significant difference between Europe and Asia with respect to 'Imports of goods and services (% of GDP)' in the years after 1990? (Stats test needed)
To compare the 'Imports of goods and services (% of GDP)' between Europe and Asia in the years after 1990, we created a line plot to visually represent the trend over time for both continents. Additionally, a t-test was used to determine if there is a statistically significant difference in imports between Europe and Asia. T-test is typically used when comparing two groups on a single variable. The results of the analysis revealed that there are no significant differences in Imports of goods and services (% of GDP)' between Europe and Asia in the years after 1990.
# line plots
df_Europe = df_1[(df_1['continent'] == 'Europe') & (df_1['Year'] > 1990) & ~df_1['Imports of goods and services (% of GDP)'].isnull()]
df_Asia = df_1[(df_1['continent'] == 'Asia') & (df_1['Year'] >1990) & ~df_1['Imports of goods and services (% of GDP)'].isnull()]
plt.plot(df_Europe['Year'],df_Europe['Imports of goods and services (% of GDP)'], label = 'Europe')
plt.plot(df_Asia['Year'],df_Asia['Imports of goods and services (% of GDP)'], label = 'Asia')
plt.title('Imports of goods and services (% of GDP) in Europe and Asia after 1990')
plt.xlabel('Year')
plt.ylabel('Imports of goods and services (% of GDP)')
plt.legend()
# t-test
tstat,pval = stats.ttest_ind(df_Europe['Imports of goods and services (% of GDP)'],
df_Asia['Imports of goods and services (% of GDP)'])
print(f'T-test statistics:{tstat}')
print(f'T-test p-value:{pval}')
T-test statistics:-1.0979607386870058 T-test p-value:0.2735222291887188
Question3: What is the country (or countries) that has the highest 'Population density (people per sq. km of land area)' across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)
Answer: To find the country (or countries) with the highest 'Population density (people per sq. km of land area)' across all years, we can create a bar chart to visualize the average ranking of population density for each country across all time points in the dataset. Monaco and Macao SAR, China have the highest 'Population density (people per sq. km of land area)' across all years.
# calculate highest 'Population density'
df_rank = df[~df['Population density (people per sq. km of land area)'].isnull()]
df_rank = df_rank[['Year', 'Country Name', 'Population density (people per sq. km of land area)']]
df_rank['rank'] = df_rank.groupby('Year')['Population density (people per sq. km of land area)'].rank()
df_rank_country = df_rank[['Country Name', 'rank']]
df_rank_country = df_rank_country.groupby('Country Name').mean().reset_index()
df_rank_country= df_rank_country.sort_values('rank')
df_rank_country.columns = ['country', 'rank']
df_rank_country.head()
fig = px.bar(data_frame=df_rank_country, x = 'country', y ='rank')
fig.show()
highest_density = df_rank_country['rank'].max()
print(f'highest rank: {highest_density}')
highest_countries = (df_rank_country.loc[df_rank_country['rank']==highest_density])['country'].values.tolist()
print(f'country (or countries) that has the highest density across all years: {highest_countries}')
highest rank: 255.3 country (or countries) that has the highest density across all years: ['Monaco', 'Macao SAR, China']
Question4: What country (or countries) has shown the greatest increase in 'Life expectancy at birth, total (years)' between 1962 and 2007?
Answer: To find the country (or countries) that has shown the greatest increase in 'Life expectancy at birth, total (years)' between 1962 and 2007, we can calculate the difference in life expectancy between 2007 and 1962 for each country, and then sort the countries by this difference in descending order to identify the countries with the greatest increase. Maldives has shown the greatest increase in 'Life expectancy at birth, total (years)' between 1962 and 2007.
#calculate difference in life expectancy between 2007 and 1962 for each country and sort the countries by this difference in descending order
df_lei = df[df['Year'].isin([2007, 1962])]
df_lei = df_lei[['Country Name', 'Year', 'Life expectancy at birth, total (years)']]
df_lei.columns = ['country', 'year', 'le']
df_lei.head()
| country | year | le | |
|---|---|---|---|
| 0 | Afghanistan | 1962 | 33.219902 |
| 9 | Afghanistan | 2007 | 57.833829 |
| 10 | Albania | 1962 | 64.162854 |
| 19 | Albania | 2007 | 76.470293 |
| 20 | Algeria | 1962 | 47.045000 |
df_lei_pivot = df_lei.pivot(index='country', columns='year', values='le')
df_lei_pivot.reset_index(inplace=True)
df_lei_pivot['lei'] = df_lei_pivot[2007] - df_lei_pivot[1962]
df_lei_pivot = df_lei_pivot[~df_lei_pivot['lei'].isnull()]
df_lei_pivot = df_lei_pivot.sort_values('lei', ascending=False)
highest_lei_increase = df_lei_pivot['lei'].max()
print(f'highest life expectancy increase: {highest_lei_increase}')
highest_lei_countries = (df_lei_pivot.loc[df_lei_pivot['lei']==highest_lei_increase])['country'].values.tolist()
print(f'country (or countries) has shown the greatest increase in Life expectancy at birth between 1962 and 2007: {highest_lei_countries}')
fig_lei = px.bar(df_lei_pivot, x = 'country', y ='lei')
pio.renderers.default = "plotly_mimetype+notebook"
fig_lei.show()
highest life expectancy increase: 36.916146341499996 country (or countries) has shown the greatest increase in Life expectancy at birth between 1962 and 2007: ['Maldives']
# export notebook to html
os.system('jupyter nbconvert --to html python_for_data_science.ipynb')
[NbConvertApp] Converting notebook python_for_data_science.ipynb to html [NbConvertApp] Writing 4465485 bytes to python_for_data_science.html
0